Air temperature (C), Wind Speed (m/s), Wind Direction (degrees from; 0=from N, 90=from E, etc.)
#Read in the file list from folder
filelist <- list.files("2011-2021", full.names = TRUE)
filelist <- sort(filelist)
filelist
## [1] "2011-2021/chi2011.04t.avg.txt" "2011-2021/chi2012.04t.avg.txt"
## [3] "2011-2021/chi2013.04t.avg.txt" "2011-2021/chi2014.04t.avg.txt"
## [5] "2011-2021/chi2015.04t.avg.txt" "2011-2021/chi2016.04t.avg.txt"
## [7] "2011-2021/chi2017.04t.avg.txt" "2011-2021/chi2018.04t.avg.txt"
## [9] "2011-2021/chi2019.04t.avg.txt" "2011-2021/chi2020.04t.avg.txt"
## [11] "2011-2021/chi2021.04t.avg.txt"
#Initialize the first file for appending
r <- read.csv(filelist[1], sep="", header = TRUE)
head(r, 10)
## DOY WS WD AT n
## 1 1 11.89 234 1.34 288
## 2 2 9.66 250 -8.07 288
## 3 3 8.11 214 -2.25 288
## 4 4 7.93 257 -2.40 288
## 5 5 5.41 236 -6.81 288
## 6 6 7.74 288 -2.97 288
## 7 7 6.85 295 -7.22 288
## 8 8 9.71 310 -8.74 288
## 9 9 5.05 312 -6.08 288
## 10 10 3.46 144 -3.00 250
#Loop to read in all files
for (i in 2:length(filelist)){
ri <- suppressMessages(read.csv(filelist[i], sep=""))
r <- rbind(r, ri)
}
df <- r
df$Date = seq(as.Date("2011-01-01"),as.Date("2011-01-01") + dim(r)[1] - 1,by = 1)
df$Month <- as.numeric(format(df$Date,'%m'))
df$Year <- as.numeric(format(df$Date,'%Y'))
tail(df)
## DOY WS WD AT n Date Month Year
## 4001 360 4.32 44 3.70 719 2021-12-14 12 2021
## 4002 361 9.58 127 4.48 719 2021-12-15 12 2021
## 4003 362 7.18 114 1.93 719 2021-12-16 12 2021
## 4004 363 5.93 288 2.11 719 2021-12-17 12 2021
## 4005 364 4.87 95 1.38 719 2021-12-18 12 2021
## 4006 365 4.81 180 2.89 715 2021-12-19 12 2021
#As TS
ggplot(df, aes(x = DOY, y = WS, color = Month)) +
geom_line()
#Distribution of all
hist(df$WS, main = 'Histogram of Wind Speed (m/s)', xlab = 'Wind Speed (m/s)')
#Box plot by Month
boxplot(WS~Month,df, ylab = 'Wind Speed (m/s)')
### Wind Direction
#WD visualization
windrose(speed = df$WS,
direction =df$WD,
speed_cuts = seq(0,25,5),
ggtheme='minimal')
#As TS
ggplot(df, aes(x = DOY, y = WD, color = Month)) +
geom_line()
#General Distribution
hist(df$WD, main = 'Histogram of Wind Direction', xlab = 'Wind Direction')
#Box plot by Month
boxplot(WD~Month,df, ylab = 'Wind Direction')
#As TS
ggplot(df, aes(x = DOY, y = AT, color = Month)) +
geom_line()
#General Distribution
hist(df$AT, main = 'Histogram of Air Tempreture', xlab = 'Air Tempreture')
#Box plot by Month
boxplot(AT~Month,df, ylab = 'Air Tempreture')
We see obvious outliers from the line plot and histogram on 2019 May.
#qplots
qplot(WD,WS,data=df,color=Month,geom = c("point","smooth"),facets = .~Month)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
qplot(AT,WS,data=df,color=Month,geom = c("point","smooth"),facets = .~Month)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor(df$WS,df$AT)
## [1] -0.1935277
cor(df$WS,df$WD)
## [1] 0.1970362
A value of 0.21, -0.22 shows there is a positive correlation between two variables, but it is weak and likely unimportant.
We wanted to explore wind speed change monthly, hence we aggregate our data into monthly averages.
df.agg <- df %>% group_by(Year, Month) %>% summarize(m_ws = mean(WS), m_at = mean(AT), m_wd = mean(WD))
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
#Rounding
df.agg$m_ws <- round(df.agg$m_ws, digits = 5)
df.agg$m_at <- round(df.agg$m_at, digits = 5)
df.agg$m_wd <- round(df.agg$m_wd, digits = 5)
#Reset row index
df.agg$Order <- 1:nrow(df.agg)
df.agg
## # A tibble: 132 × 6
## # Groups: Year [11]
## Year Month m_ws m_at m_wd Order
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2011 1 7.18 -5.03 239. 1
## 2 2011 2 8.34 -2.38 192. 2
## 3 2011 3 6.97 1.91 138 3
## 4 2011 4 7.97 7.60 177. 4
## 5 2011 5 7.75 12.2 189. 5
## 6 2011 6 6.24 18.7 201. 6
## 7 2011 7 4.81 24.2 169. 7
## 8 2011 8 5.68 22.9 163. 8
## 9 2011 9 7.37 16.4 180. 9
## 10 2011 10 7.33 13.3 202. 10
## # … with 122 more rows
#As TS
ggplot(df.agg, aes(x = Order, y = m_ws, color = Month)) +
geom_line()
#Distribution of all
hist(df.agg$m_ws, main = 'Histogram of Wind Speed (m/s)', xlab = 'Wind Speed (m/s)')
#Box plot by Month
boxplot(m_ws~Month,df.agg, ylab = 'Wind Speed (m/s)')
### Wind Direction
windrose(speed = df.agg$m_ws,
direction =df.agg$m_wd,
speed_cuts = seq(0,10,2),
ggtheme='minimal')
#As TS
ggplot(df.agg, aes(x = Order, y = m_wd, color = Month)) +
geom_line()
#General Distribution
hist(df.agg$m_wd, main = 'Histogram of Wind Direction', xlab = 'Wind Direction')
#Box plot by Month
boxplot(m_wd~Month,df.agg, ylab = 'Wind Direction')
### Air Tempreture
#As TS
ggplot(df.agg, aes(x = Order, y = m_at, color = Month)) +
geom_line()
#General Distribution
hist(df.agg$m_at, main = 'Histogram of Air Tempreture', xlab = 'Wind Tempreture')
#Box plot by Month
boxplot(m_at~Month,df.agg, ylab = 'Wind Tempreture')
We see an extreme outlier in our air temperature dataset.
#Getting outliers at day to day level
at.outliers <- boxplot(df.agg$m_at, plot=FALSE)$out
df.agg[which(df.agg$m_at %in% at.outliers),]
## # A tibble: 1 × 6
## # Groups: Year [1]
## Year Month m_ws m_at m_wd Order
## <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 2019 5 5.93 -47.3 189. 101
#which(demo.data$score > quantile(demo.data$score)[4] + 1.5*IQR(demo.data$score)
We see the record from 2019 May that has an unusual air temperature at -47.25484.
hist(df[which(df$Year == 2019 & df$Month == 5),'AT'], xlab = 'Air Temperature', main= 'Daily Air Temperature from 2019 May')
We see an unusual pattern of distribution which have much data points at -99 air temperature. We consider them corrupted data and hence replace the monthly air temperature from 2019 May with the average of May air temperature from other years.
#New temp as the average of May air temperature from other years
new_at <- mean(unlist(df.agg[which(df.agg$Month == 5 & df.agg$Year != 2019),'m_at']))
df.agg[which(df.agg$m_at %in% at.outliers),'m_at'] <- new_at
df.agg$m_at
## [1] -5.02968 -2.38464 1.91097 7.60433 12.23323 18.66533 24.18484 22.92194
## [9] 16.44300 13.26774 7.07967 2.05355 -0.05935 0.53517 9.82194 9.01167
## [17] 16.02065 21.93900 25.79710 22.61097 17.94200 10.78194 5.75400 1.90032
## [25] -2.52161 -2.22107 0.24000 7.40633 13.11452 17.66467 22.08000 22.19000
## [33] 19.30033 11.80742 3.57000 -4.33516 -7.68677 -7.24500 -0.23419 7.55100
## [41] 13.95194 17.87167 20.26839 21.77677 17.13667 11.47484 1.38967 0.70387
## [49] -4.28419 -7.77500 2.73258 8.01800 13.82194 17.17900 21.65871 21.53226
## [57] 19.63967 13.19484 7.97800 3.72935 -1.82000 -0.03172 6.43161 8.70633
## [65] 16.16548 21.56700 24.40774 24.11935 20.64033 14.77323 7.00100 -3.45355
## [73] 0.31968 4.75857 3.43581 10.74067 14.89000 21.90700 22.51903 21.24645
## [81] 20.79067 12.09387 5.40300 -5.37161 -1.50968 1.46250 1.67903 7.22100
## [89] 15.22806 20.38200 23.53161 23.98839 18.96967 10.40548 1.02833 1.67194
## [97] -5.25323 -2.81286 5.48710 8.50733 15.11629 19.77367 23.96355 21.83742
## [105] 18.42033 6.54484 2.41467 2.23613 -0.44774 0.88724 4.98323 7.31333
## [113] 17.01935 22.20033 23.84097 23.19097 16.75133 10.69387 5.18767 -0.13419
## [121] -3.66452 0.11571 8.14968 9.08567 18.71774 21.70367 23.86710 23.79516
## [129] 20.32833 11.95613 3.69700 3.91421
Checking the distribution of air temperature after taking out the outlier.
#As TS
ggplot(df.agg, aes(x = Order, y = m_at, color = Month)) +
geom_line()
#General Distribution
hist(df.agg$m_at, main = 'Histogram of Air Tempreture', xlab = 'Wind Tempreture')
#Box plot by Month
boxplot(m_at~Month,df.agg, ylab = 'Wind Tempreture')
We see the outlier is removed while maintaining the original shape of distribution.
#write.csv(r, 'all_days.csv')
write.csv(df.agg, 'agg.csv')